{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d2ab063b-3e08-4b35-8eb9-75bdd00f4456",
   "metadata": {},
   "source": [
    "In this notebook we will show how to use Hugginface's tokenizers and models as they are integrated within the library. In notebook number 17 you can find examples on how to code your own, custom, Hugginface (hereafter HF) model and use it in combination of any other model in the library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "76722325",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/javierrodriguezzaurin/.pyenv/versions/3.10.13/envs/widedeep310/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import f1_score, accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from pytorch_widedeep import Trainer\n",
    "from pytorch_widedeep.models import HFModel, WideDeep\n",
    "from pytorch_widedeep.metrics import F1Score, Accuracy\n",
    "from pytorch_widedeep.datasets import load_womens_ecommerce\n",
    "from pytorch_widedeep.preprocessing import HFPreprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b3ff23e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df: pd.DataFrame = load_womens_ecommerce(as_frame=True)  # type: ignore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a2af0068",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(23486, 10)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5b595276",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Clothing ID</th>\n",
       "      <th>Age</th>\n",
       "      <th>Title</th>\n",
       "      <th>Review Text</th>\n",
       "      <th>Rating</th>\n",
       "      <th>Recommended IND</th>\n",
       "      <th>Positive Feedback Count</th>\n",
       "      <th>Division Name</th>\n",
       "      <th>Department Name</th>\n",
       "      <th>Class Name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7004</th>\n",
       "      <td>862</td>\n",
       "      <td>43</td>\n",
       "      <td>Cute and feminine</td>\n",
       "      <td>Loved this sweater wrap and bought it in both ...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>General</td>\n",
       "      <td>Tops</td>\n",
       "      <td>Knits</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12508</th>\n",
       "      <td>975</td>\n",
       "      <td>66</td>\n",
       "      <td>Love it</td>\n",
       "      <td>The linen fabric is elegantly thin feels and l...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>General</td>\n",
       "      <td>Jackets</td>\n",
       "      <td>Jackets</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10288</th>\n",
       "      <td>950</td>\n",
       "      <td>41</td>\n",
       "      <td>Perfect for fall</td>\n",
       "      <td>This sweater is just as pictured. the fit is t...</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>General</td>\n",
       "      <td>Tops</td>\n",
       "      <td>Sweaters</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Clothing ID  Age              Title  \\\n",
       "7004           862   43  Cute and feminine   \n",
       "12508          975   66            Love it   \n",
       "10288          950   41   Perfect for fall   \n",
       "\n",
       "                                             Review Text  Rating  \\\n",
       "7004   Loved this sweater wrap and bought it in both ...       5   \n",
       "12508  The linen fabric is elegantly thin feels and l...       5   \n",
       "10288  This sweater is just as pictured. the fit is t...       5   \n",
       "\n",
       "       Recommended IND  Positive Feedback Count Division Name Department Name  \\\n",
       "7004                 1                        2       General            Tops   \n",
       "12508                1                        3       General         Jackets   \n",
       "10288                1                        0       General            Tops   \n",
       "\n",
       "      Class Name  \n",
       "7004       Knits  \n",
       "12508    Jackets  \n",
       "10288   Sweaters  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sample(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "11c55682",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's do some mild preprocessing\n",
    "df.columns = [c.replace(\" \", \"_\").lower() for c in df.columns]\n",
    "\n",
    "# classes from [0,num_class)\n",
    "df[\"rating\"] = (df[\"rating\"] - 1).astype(\"int64\")\n",
    "\n",
    "# group reviews with 1 and 2 scores into one class\n",
    "df.loc[df.rating == 0, \"rating\"] = 1\n",
    "\n",
    "# and back again to [0,num_class)\n",
    "df[\"rating\"] = (df[\"rating\"] - 1).astype(\"int64\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fa687d1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop short reviews\n",
    "df = df[~df.review_text.isna()]\n",
    "df[\"review_length\"] = df.review_text.apply(lambda x: len(x.split(\" \")))\n",
    "df = df[df.review_length >= 5]\n",
    "df = df.drop(\"review_length\", axis=1).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d9f61fb5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(22608, 10)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "115bb5aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "rating\n",
       "3    12515\n",
       "2     4904\n",
       "1     2820\n",
       "0     2369\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if you run this on a CPU, you might want to subsample the dataset. With that in mind I am simply going to stratify-sample to the minimum category occurrence and then sample at random\n",
    "# If you run this on a GPU you can comment out the following two cells\n",
    "df.rating.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "247e1b0e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_2/lrjn1qn54c758tdtktr1bvkc0000gn/T/ipykernel_5886/895673206.py:3: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "  .apply(lambda x: x.sample(min(len(x), 2369)))\n"
     ]
    }
   ],
   "source": [
    "df = (\n",
    "    df.groupby(\"rating\", group_keys=False)\n",
    "    .apply(lambda x: x.sample(min(len(x), 2369)))\n",
    "    .sample(1000)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4f0323f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(df, train_size=0.8, random_state=1, stratify=df.rating)\n",
    "\n",
    "# possible model names currently supported in the library\n",
    "model_names = [\n",
    "    \"distilbert-base-uncased\",\n",
    "    \"bert-base-uncased\",\n",
    "    \"FacebookAI/roberta-base\",\n",
    "    \"albert-base-v2\",\n",
    "    \"google/electra-base-discriminator\",\n",
    "]\n",
    "\n",
    "# Let's choose one. The syntax is the same for all the models\n",
    "model_name = \"distilbert-base-uncased\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5246e60",
   "metadata": {},
   "source": [
    "Now we can use the `HFPreprocessor` class. As most things in this library, the integration with HF has been coded aiming for a flexible use. With this in mind, there are two ways one can use a `HFPreprocessor` class. \n",
    "\n",
    "1. Passing a `text_col` and `encode_params` as the class is instantiated and then using the `fit` and `transform` as with any other preprocessor in the library\n",
    "2. Without passing  `text_col` and `encode_params` as the class is instantiated and using the `encode` method of the `HFPreprocessor` which is simply a wrapper around the encode method of HF's tokenizers\n",
    "\n",
    "Let's have a look"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "58a12639",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer1 = HFPreprocessor(\n",
    "    model_name=model_name,\n",
    "    text_col=\"review_text\",\n",
    "    num_workers=1,\n",
    "    encode_params={\n",
    "        \"max_length\": 90,\n",
    "        \"padding\": \"max_length\",\n",
    "        \"truncation\": True,\n",
    "        \"add_special_tokens\": True,\n",
    "    },\n",
    ")\n",
    "\n",
    "X_text_tr1 = tokenizer1.fit_transform(train)\n",
    "X_text_te1 = tokenizer1.transform(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f4940133",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer2 = HFPreprocessor(\n",
    "    model_name=model_name,\n",
    "    num_workers=1,\n",
    ")\n",
    "\n",
    "X_text_tr2 = tokenizer2.encode(\n",
    "    train.review_text.tolist(),\n",
    "    max_length=90,\n",
    "    padding=\"max_length\",\n",
    "    truncation=True,\n",
    "    add_special_tokens=True,\n",
    ")\n",
    "X_text_te2 = tokenizer2.encode(\n",
    "    test.review_text.tolist(),\n",
    "    max_length=90,\n",
    "    padding=\"max_length\",\n",
    "    truncation=True,\n",
    "    add_special_tokens=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "9f0a7883",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all(X_text_tr1[0] == X_text_tr2[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "4567e051",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now we define a model which is as easy as:\n",
    "# Note that this will instantiation will lead to NO parameter trainable in the HF model.\n",
    "# If you want to fine-tune the HF model, you can set the trainable parameters via the 'trainable_parameters' argument.\n",
    "# Alternatively, you can use a head (MLP) via the 'head'-related arguments (see the docs for more details)\n",
    "hf_model = HFModel(model_name=model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0a222ea2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch 1: 100%|██████████| 13/13 [02:06<00:00,  9.75s/it, loss=3.2, metrics={'acc': 0.235, 'f1': 0.2336}]  \n"
     ]
    }
   ],
   "source": [
    "# And from here on is the same as any other WideDeep model\n",
    "model = WideDeep(\n",
    "    deeptext=hf_model,\n",
    "    pred_dim=4,\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model,\n",
    "    objective=\"multiclass\",\n",
    "    metrics=[Accuracy(), F1Score(average=True)],\n",
    ")\n",
    "\n",
    "trainer.fit(\n",
    "    X_text=X_text_tr2,\n",
    "    target=train.rating.values,\n",
    "    n_epochs=1,\n",
    "    batch_size=64,\n",
    ")\n",
    "# If you run this on a CPU and you sampled the data, the metrics will not be better than a random guess. Remember, this is just a demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "58bbf1fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "predict: 100%|██████████| 4/4 [00:05<00:00,  1.43s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.2500\n",
      "F1: 0.1000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "preds_text = trainer.predict_proba(X_text=X_text_te2)\n",
    "pred_text_class = np.argmax(preds_text, 1)\n",
    "\n",
    "acc_text = accuracy_score(test.rating, pred_text_class)\n",
    "f1_text = f1_score(test.rating, pred_text_class, average=\"weighted\")\n",
    "print(f\"Accuracy: {acc_text:.4f}\")\n",
    "print(f\"F1: {f1_text:.4f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
